This talk is a notebook, and it's available on GitHub: maigimenez/ep2016_vect4word
Schler, J., Koppel, M., Argamon, S., & Pennebaker, J. W. (2006, March). Effects of Age and Gender on Blogging. In AAAI Spring Symposium: Computational Approaches to Analyzing Weblogs (Vol. 6, pp. 199–205). Read the paper.
| Feature | Male | Female |
|---|---|---|
| linux | 0.53 ± 0.04 | 0.03 ± 0.01 |
| india | 0.62 ± 0.04 | 0.15 ± 0.01 |
| programming | 0.36 ± 0.02 | 0.08 ± 0.01 |
| | 0.90 ± 0.04 | 0.19 ± 0.02 |
| software | 0.99 ± 0.05 | 0.17 ± 0.02 |
| shopping | 0.66 ± 0.02 | 1.48 ± 0.03 |
| mom | 2.07 ± 0.05 | 4.69 ± 0.08 |
| cried | 0.31 ± 0.01 | 0.72 ± 0.02 |
| freaked | 0.08 ± 0.01 | 0.21 ± 0.01 |
| cute | 0.83 ± 0.03 | 2.32 ± 0.04 |

Word frequency (per 10,000 words) and standard error by gender (Schler et al., 2006)
In [1]:
from configparser import ConfigParser
from os.path import join
from os import pardir
In [2]:
config = ConfigParser()
config.read(join(pardir,'src','credentials.ini'))
APP_KEY = config['twitter']['app_key']
APP_SECRET = config['twitter']['app_secret']
OAUTH_TOKEN = config['twitter']['oauth_token']
OAUTH_TOKEN_SECRET = config['twitter']['oauth_token_secret']
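For reference, this is roughly what src/credentials.ini is expected to look like (the section and key names come from the code above; the values are placeholders for your own Twitter API credentials):

[twitter]
app_key = YOUR_APP_KEY
app_secret = YOUR_APP_SECRET
oauth_token = YOUR_OAUTH_TOKEN
oauth_token_secret = YOUR_OAUTH_TOKEN_SECRET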
In [3]:
from twitter import oauth, Twitter, TwitterHTTPError
In [ ]:
auth = oauth.OAuth(OAUTH_TOKEN, OAUTH_TOKEN_SECRET,
                   APP_KEY, APP_SECRET)
twitter_api = Twitter(auth=auth)
twitter_api.retry = True
Full disclaimer: gender is not a binary issue. This is just a simplified example. If you'd like to expand this experiment, go ahead and contact me!
In [ ]:
brogrammers = ['jakevdp', 'rasbt', 'GaelVaroquaux', 'amuellerml', 'fperez_org',
               'fpedregosa', 'ogrisel', 'dontusethiscode', 'randal_olson', 'tdhopper']
sisgrammers = ['pkafei', 'LorenaABarba', 'jessicamckellar', 'heddle317', 'diana_clarke',
               'wholemilk', 'spang', 'cecilycarver', 'juliaelman', 'b0rk']

brotweets = []
for bro in brogrammers:
    brotweets.extend(twitter_api.statuses.user_timeline(screen_name=bro, count=100))

sistweets = []
for sis in sisgrammers:
    sistweets.extend(twitter_api.statuses.user_timeline(screen_name=sis, count=100))
In [ ]:
import re
def clean_tweet(tweet):
    """ Simplest preprocessing.
    Convert a tweet to lowercase and replace URLs and @usernames by a generic token.
    Args:
        tweet (str): Tweet to clean.
    Returns:
        str: Preprocessed tweet
    """
    tweet = tweet.lower()
    # Replace URLs with a token
    URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    tweet = re.sub(URL_REGEX, '<url>', tweet, flags=re.MULTILINE)
    # Replace usernames with a token
    tweet = re.sub("@([A-Za-z0-9_]+)", "<user>", tweet)
    # Collapse repeated spaces
    tweet = re.sub(r"\s{2,}", " ", tweet)
    # If a character is repeated more than 4 times, keep only 3 repetitions
    tweet = re.sub(r'(.)\1{4,}', r'\1\1\1', tweet)
    return tweet
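As a quick sanity check, here is a small usage sketch of clean_tweet on a made-up tweet (the input string is invented for illustration):

example = "I loooooove Python!!!!! thanks @pyladies https://europython.eu"
print(clean_tweet(example))
# roughly: 'i looove python!!! thanks <user> <url>'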
In [ ]:
import pandas as pd

dataset = []
# Gather the text
for tweet in brotweets:
    cleaned_tweet = clean_tweet(tweet['text'])
    dataset.append({'id': tweet['id'], 'text': cleaned_tweet, 'class': 0})
for tweet in sistweets:
    cleaned_tweet = clean_tweet(tweet['text'])
    dataset.append({'id': tweet['id'], 'text': cleaned_tweet, 'class': 1})
pd_dataset = pd.DataFrame(dataset)
In [ ]:
pd_dataset.head()
In [ ]:
pd_dataset.to_csv('../corpora/full_dataset.csv')
In [ ]:
pd_dataset[['class', 'id']].to_csv('../corpora/ep16.csv')
In [4]:
import pandas as pd
DATASET_PATH = "../corpora/full_dataset.csv"
pd_dataset = pd.read_csv(DATASET_PATH, index_col=0)
pd_dataset.head()
Out[4]:
In [5]:
import nltk.data
#nltk.download()
In [6]:
from nltk.tokenize import TweetTokenizer, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import re
import scipy.stats as stats
In [7]:
print(', '.join(stopwords.words('english')[:20]))
In [8]:
def get_vocabulary(corpus, tokenizer):
    """ Get the vocabulary of a dataset.
    Get the vocabulary of a set of tweets after removing stopwords and non-letters,
    and replacing each number by the token <number>.
    Args:
        corpus (list of tweets): A list of tweets.
        tokenizer (function): Tokenizer function, used to get the tokens of each tweet.
    Returns:
        Counter: Vocabulary with the frequency of each word in it.
    """
    stop_words = stopwords.words('english')
    # Remove punctuation marks
    no_punks = [re.sub(r'\W', ' ', tweet) for tweet in corpus]
    # Tokenize and remove stop words
    clean_tokens = []
    for tweet in no_punks:
        # Replace numbers with a token
        tweet = re.sub(r"\.\d+\s*", ".<number> ", tweet)
        tweet = re.sub(r"\d+\s*", " <number> ", tweet)
        tokens = tokenizer(tweet)
        tokens = [token for token in tokens if token not in stop_words]
        clean_tokens.extend(tokens)
    # Build the vocabulary
    return Counter(clean_tokens)
In [9]:
tknzr = TweetTokenizer()
brotweets = pd_dataset[pd_dataset['class'] == 0]['text'].tolist()
sistweets = pd_dataset[pd_dataset['class'] == 1]['text'].tolist()
brocabulary = get_vocabulary(brotweets, tknzr.tokenize)
siscabulary = get_vocabulary(sistweets, tknzr.tokenize)
In [10]:
brocabulary.most_common(10)
Out[10]:
In [11]:
siscabulary.most_common(10)
Out[11]:
In [12]:
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.io import output_notebook
from bokeh.models import HoverTool
output_notebook()
In [13]:
MOST_COMMON = 50
mc_brocavulary = brocabulary.most_common(int(MOST_COMMON / 2))
mc_siscavulary = siscabulary.most_common(int(MOST_COMMON / 2))
fr_brocavulary, fr_siscavulary = [], []
most_common_words = mc_brocavulary + mc_siscavulary
words = list(set(word for word, _ in most_common_words))
for word in words:
    if word in brocabulary:
        fr_brocavulary.append(brocabulary[word])
    else:
        fr_brocavulary.append(0)
    if word in siscabulary:
        fr_siscavulary.append(siscabulary[word])
    else:
        fr_siscavulary.append(0)
In [14]:
import numpy as np

range_words = list(range(1, len(words) + 1))
source = ColumnDataSource(data=dict(range_words=range_words,
                                    words=words,
                                    freq_true=fr_brocavulary,
                                    freq_false=fr_siscavulary))
# A single HoverTool that shows the word under the cursor
hover = HoverTool(tooltips=[("words", "@words")])
hover.point_policy = "follow_mouse"
TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
p = figure(title="Vocabulary gender", x_range=words, tools=[TOOLS, hover])
p.xaxis.axis_label = 'Words'
p.yaxis.axis_label = 'Frequency'
p.circle('range_words', 'freq_true', source=source, fill_alpha=0.2, size=10, color="navy")
p.circle('range_words', 'freq_false', source=source, fill_alpha=0.2, size=10, color='red')
p.xaxis.major_label_orientation = np.pi / 4
show(p)
Out[14]:
In [15]:
tweet_lens_bro = [len(tweet) for tweet in brotweets]
hist_bro, edges_bro = np.histogram(tweet_lens_bro, density=True, bins=20)
tweet_lens_bro.sort()
tweet_lens_sis = [len(tweet) for tweet in sistweets]
hist_sis, edges_sis = np.histogram(tweet_lens_sis, density=True, bins=20)
tweet_lens_sis.sort()
p = figure(title="")
p.quad(top=hist_bro, bottom=0, left=edges_bro[:-1], right=edges_bro[1:],
fill_color="navy", line_color="#033649", fill_alpha=0.3)
p.quad(top=hist_sis, bottom=0, left=edges_sis[:-1], right=edges_sis[1:],
fill_color="red", line_color="#033649", fill_alpha=0.3)
sigma = np.std(tweet_lens_bro)
mu = np.mean(tweet_lens_bro)
pdf = stats.norm.pdf(tweet_lens_bro, mu, sigma)
p.line(tweet_lens_bro, pdf, line_color="navy", line_width=6, alpha=0.7, legend="PDF")
sigma = np.std(tweet_lens_sis)
mu = np.mean(tweet_lens_sis)
pdf = stats.norm.pdf(tweet_lens_sis, mu, sigma)
p.line(tweet_lens_sis, pdf, line_color="red", line_width=6, alpha=0.7, legend="PDF")
p.xaxis.axis_label = 'len(tweets)'
p.yaxis.axis_label = 'density'
show(p)
Out[15]:
In [16]:
from sklearn.model_selection import train_test_split
X = pd_dataset['text'].tolist()
y = pd_dataset['class'].tolist()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
print("Examples in train: {}".format(len(X_train)))
print("Examples in test: {}".format(len(X_test)))
dataset = ["I love Python", "I love NLP", "Pyladies are cool"]
vocabulary = set(["I", "love", "Python", "NLP", "Pyladies", "are", "cool"])
dataset_representation = [[1, 1, 1, 0, 0, 0, 0],
                          [1, 1, 0, 1, 0, 0, 0],
                          [0, 0, 0, 0, 1, 1, 1]]
OK, that looks cool. This looks solved. Let's party!
hotel = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
motel = [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
w_hotel AND w_motel = 0 — the one-hot vectors share no non-zero component, so they tell us nothing about how similar the two words are.
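To make this concrete, here is a minimal sketch using the toy vectors above, showing that one-hot representations carry no notion of similarity:

import numpy as np

hotel = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0])
motel = np.array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# The vectors share no non-zero component, so AND / dot product is 0
print(np.dot(hotel, motel))                  # 0
print(np.logical_and(hotel, motel).any())    # False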
In [17]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer="word",
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             ngram_range=(1, 1),
                             max_features=5000)
# Fit the train
BOW_train = vectorizer.fit_transform(X_train)
BOW_train = BOW_train.toarray()
# Transform the test
BOW_test = vectorizer.transform(X_test)
BOW_test = BOW_test.toarray()
print('Train: {{0|1}}^({}x{})'.format(BOW_train.shape[0], BOW_train.shape[1]))
print('Test: {{0|1}}^({}x{})'.format(BOW_test.shape[0], BOW_test.shape[1]))
vocab = vectorizer.get_feature_names()
print('\nVOCABULARY EXTRACT: {}'.format(', '.join(vocab[500:600])))
np.set_printoptions(threshold=np.inf)
print('\nTWEET REPRESENTATION: {}'.format(BOW_train[0]))
You shall know a word by the company it keeps.
-- J.R. Firth 1957:11
Ok, we need to talk.
Mikolov, T., Sutskever, I., Chen, K., Corrado, G. S., & Dean, J. (2013). Distributed representations of words and phrases and their compositionality. In Advances in neural information processing systems (pp. 3111-3119).
Now, we can go on.
word2vec implements two algorithms:
- Continuous bag-of-words (CBOW): predict a word from the words around it.
- Skip-gram: predict the surrounding words from the current word.
These algorithms are trained using either:
- Hierarchical softmax, or
- Negative sampling (see the sketch below).
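As a minimal sketch (assuming gensim's Word2Vec API, which we use later in this notebook, and a toy corpus invented for illustration), these choices are exposed through the sg, hs and negative parameters:

from gensim.models import word2vec

toy_corpus = [["i", "love", "python"], ["pyladies", "are", "cool"]]
# sg=0 -> CBOW, sg=1 -> skip-gram
# hs=1 -> hierarchical softmax; hs=0 with negative > 0 -> negative sampling
cbow_negative = word2vec.Word2Vec(toy_corpus, sg=0, hs=0, negative=5, size=100, min_count=1)
skipgram_hsoftmax = word2vec.Word2Vec(toy_corpus, sg=1, hs=1, negative=0, size=100, min_count=1)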
Pennington, J., Socher, R., & Manning, C. D. (2014, October). Glove: Global Vectors for Word Representation. In EMNLP (Vol. 14, pp. 1532-43).
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
In [18]:
from os.path import join
In [19]:
import numpy as np

def load_glove_dict(glove_filepath):
    """ Build a dictionary with GloVe values.
    Read the GloVe resource and build a dictionary where the key is the word
    and the value is its GloVe representation.
    Args:
        glove_filepath (str): Path where the GloVe resource is.
    Returns:
        dict: Dictionary with GloVe data.
    """
    glove_embeddings = {}
    # TODO: check if the filepath exists
    with open(glove_filepath) as glove_file:
        for line in glove_file:
            split_line = line.split()
            # Store the vector as floats rather than strings
            word, vector = split_line[0], np.asarray(split_line[1:], dtype=np.float32)
            glove_embeddings[word] = vector
    return glove_embeddings
In [20]:
GLOVE_PATH = '../../../resources/GloVe/twitter_dataset'
embedding_size = '25'
glove_file = join(GLOVE_PATH, 'glove.twitter.27B.' + embedding_size + 'd.txt')
glove_25 = load_glove_dict(glove_file)
In [21]:
embedding_size = '100'
glove_file = join(GLOVE_PATH, 'glove.twitter.27B.' + embedding_size + 'd.txt')
glove_100 = load_glove_dict(glove_file)
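As a quick sanity check on the loaded vectors, here is a small sketch that computes cosine similarity between a couple of words (assuming both words appear in the GloVe Twitter vocabulary):

def cosine_similarity(u, v):
    # Cosine similarity between two word vectors
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

print(cosine_similarity(glove_100['python'], glove_100['programming']))
print(cosine_similarity(glove_100['python'], glove_100['shopping']))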
In [23]:
def get_most_common_vocab(most_common, vocabulary):
    """ Get the most common words in a vocabulary.
    Args:
        most_common (int): Number of most common words to retrieve.
        vocabulary (Counter): Vocabulary with words and frequencies of each word.
    Returns:
        set: Set of most common words in this vocabulary.
    """
    most_common_words = vocabulary.most_common(int(most_common))
    return set(word for word, _ in most_common_words)

def get_words_to_plot(most_common, vocabulary, dictionary):
    # Keep the embeddings of the most common words; collect the words without an embedding
    words_to_plot = {}
    unseen_words = []
    for word in get_most_common_vocab(most_common, vocabulary):
        if word in dictionary:
            words_to_plot[word] = dictionary[word]
        else:
            unseen_words.append(word)
    return words_to_plot, unseen_words
In [24]:
from sklearn.manifold import TSNE

def plot_tsne(dictionary, most_common):
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
    words_to_plot_bros, unseen_words_bros = get_words_to_plot(most_common, brocabulary, dictionary)
    words_to_plot_sis, unseen_words_sis = get_words_to_plot(most_common, siscabulary, dictionary)
    low_dim_embs_bros = tsne.fit_transform(list(words_to_plot_bros.values()))
    low_dim_embs_sis = tsne.fit_transform(list(words_to_plot_sis.values()))

    words_bros = list(words_to_plot_bros.keys())
    range_words_bros = list(range(1, len(words_bros) + 1))
    source_bros = ColumnDataSource(data=dict(range_words=range_words_bros,
                                             words_bros=words_bros,
                                             x=low_dim_embs_bros[:, 0],
                                             y=low_dim_embs_bros[:, 1]))
    words_sis = list(words_to_plot_sis.keys())
    range_words_sis = list(range(1, len(words_sis) + 1))
    source_sis = ColumnDataSource(data=dict(range_words=range_words_sis,
                                            words_sis=words_sis,
                                            x=low_dim_embs_sis[:, 0],
                                            y=low_dim_embs_sis[:, 1]))

    # A single HoverTool showing the word under the cursor for each group
    hover = HoverTool(tooltips=[("words_bros", "@words_bros"),
                                ("words_sis", "@words_sis")])
    hover.point_policy = "follow_mouse"
    TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
    p = figure(title="Word visualization", tools=[TOOLS, hover])
    p.circle('x', 'y', source=source_bros, fill_alpha=0.2, size=10, color='navy')
    p.circle('x', 'y', source=source_sis, fill_alpha=0.2, size=10, color='red')
    show(p)
    return set(unseen_words_bros + unseen_words_sis)
In [25]:
unseen_words = plot_tsne(glove_100, 1000)
In [26]:
print(', '.join(unseen_words))
In [27]:
def tokenize_dataset(tokenizer, dataset):
    tokenized = []
    for tweet in dataset:
        # Replace numbers with a token
        tweet = re.sub(r"\.\d+\s*", ".<number> ", tweet)
        tweet = re.sub(r"\d+\s*", " <number> ", tweet)
        tokens = tokenizer(tweet)
        tokenized.append(tokens)
    return tokenized
In [28]:
X_train_tokenized = tokenize_dataset(TweetTokenizer().tokenize, X_train)
X_test_tokenized = tokenize_dataset(TweetTokenizer().tokenize, X_test)
In [29]:
def get_embeddings(dataset, dictionary, embedding_size):
    X_embeddings = []
    for tweet in dataset:
        tweet_embeddings = []
        for word in tweet:
            if word in dictionary:
                tweet_embeddings.append(dictionary[word])
        if not tweet_embeddings:
            tweet_embeddings.append(np.zeros(embedding_size))
        # Each tweet has a different number of words and ML techniques require fixed-size inputs,
        # so we average the word embeddings to get one vector per tweet.
        X_embeddings.append(np.mean(np.asarray(tweet_embeddings, dtype=np.float32), axis=0))
    return X_embeddings
In [30]:
X_train_GloVe = get_embeddings(X_train_tokenized, glove_100, 100)
X_test_GloVe = get_embeddings(X_test_tokenized, glove_100, 100)
In [31]:
from gensim.models import word2vec
In [32]:
# Initialize and train the model (this will take some time)
model = word2vec.Word2Vec(X_train_tokenized,
workers = 4,
size = 100,
min_count = 1, # How many times a word should appear to be taken into account
window = 5,
sample = 1e-3 , # Downsample setting for frequent words
batch_words = 100) # Batches of examples passed to worker threads
# This model won't be updated
model.init_sims(replace=True)
model_name = "word2vec"
model.save(model_name)
In [33]:
model.syn0.shape
Out[33]:
In [34]:
_ = plot_tsne(model, 1000)
In [35]:
model.most_similar("python")
Out[35]:
In [36]:
X_train_word2vec = get_embeddings(X_train_tokenized, model, 100)
X_test_word2vec = get_embeddings(X_test_tokenized, model, 100)
In [37]:
from sklearn import svm
from sklearn.metrics import classification_report, roc_curve, auc
In [38]:
clf = svm.SVC()
clf.fit(BOW_train, y_train)
prediction_BOW = clf.predict(BOW_test)
target_names = ['Bros', 'Sis']
print(classification_report(y_test, prediction_BOW, target_names=target_names))
In [39]:
clf = svm.SVC()
clf.fit(X_train_GloVe, y_train)
prediction_GloVe = clf.predict(X_test_GloVe)
print(classification_report(y_test, prediction_GloVe, target_names=target_names))
In [40]:
clf = svm.SVC()
clf.fit(X_train_word2vec, y_train)
prediction_word2vec = clf.predict(X_test_word2vec)
print(classification_report(y_test, prediction_word2vec, target_names=target_names))
In [41]:
false_positive_rate_bow, true_positive_rate_bow, _ = roc_curve(y_test, prediction_BOW)
roc_auc_bow = auc(false_positive_rate_bow, true_positive_rate_bow)
false_positive_rate_glove, true_positive_rate_glove, _ = roc_curve(y_test, prediction_GloVe)
roc_auc_glove = auc(false_positive_rate_glove, true_positive_rate_glove)
false_positive_rate_w2v, true_positive_rate_w2v, _ = roc_curve(y_test, prediction_word2vec)
roc_auc_w2v = auc(false_positive_rate_w2v, true_positive_rate_w2v)
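Note that these ROC curves are built from hard 0/1 predictions, so each curve really has a single operating point. If you want smoother, more informative curves, here is a small sketch using the classifier's continuous scores instead (re-fitting an SVC on the word2vec features, as above):

clf = svm.SVC()
clf.fit(X_train_word2vec, y_train)
scores_w2v = clf.decision_function(X_test_word2vec)
fpr_w2v, tpr_w2v, _ = roc_curve(y_test, scores_w2v)
print(auc(fpr_w2v, tpr_w2v))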
In [42]:
from bokeh.palettes import Spectral6
p = figure(title="Receiver Operating Characteristic", tools=TOOLS)
p.line(false_positive_rate_bow, true_positive_rate_bow, legend='BoW ROC curve (area = {:.2f})'.format(roc_auc_bow),
line_color="green", line_width=2)
p.line(false_positive_rate_glove, true_positive_rate_glove,
legend='GloVe ROC curve (area = {:.2f})'.format(roc_auc_glove),
line_color="blue", line_width=2)
p.line(false_positive_rate_w2v, true_positive_rate_w2v,
legend='W2V ROC curve (area = {:.2f})'.format(roc_auc_w2v),
line_color="yellow", line_width=2)
p.line([0.0, 1.0], [0.0, 1.0], legend='Guessing',
line_color="gray", line_width=2, line_dash=(4, 4))
p.xaxis.axis_label = 'False Positive Rate'
p.yaxis.axis_label = 'True Positive Rate'
p.legend.location = 'bottom_right'
show(p)
Out[42]:
If you have guessed that maybe the mean is not the best option for representing sentences, you may want to use a neural network toolkit: try Layers.
And finally ...